Bacteremia data analysis

Author

Michael Kammer

Published

2025-04-22

1 Background

This frequentist analysis follows the statistical analysis plan by Michael Kammer dated 27.5.2024 (see version history). IDA and descriptives are presented in an overview notebook.

As of 29.8.2024 it was decided to conduct this analysis also for only one third of the dataset for this example, to make the tradeoff between model complexity and prediction performance more apparent. This notebook contains results for the params$set dataset.

2 Setup and data

Data were downloaded from Zenodo and stored locally for ease of use.

Processing steps:

  • Convert BloodCulture to numeric
  • Reduce the dataset by using only the first 4000 observations
Code
# record start time to report total notebook runtime
T_START = Sys.time()

# data handling and printing
library(tidyverse)
library(glue)
library(magrittr)
library(patchwork)
library(doFuture)
# imputation
library(mice)
# also requires furrr to be installed for parallel computations
# modeling
library(glmnet)
# used but not loaded
# rms

# read raw data; outcome BloodCulture is recoded from "yes"/"no" to 1/0
df = read.csv("Data/Raw/Bacteremia_public_S2.csv",
              header = TRUE, sep = ",", dec = ".") %>%
    mutate(
        BloodCulture = if_else(BloodCulture == "yes", 1, 0)
    )

# SUFFIX distinguishes output files for the full vs. reduced dataset;
# params$set is supplied via notebook parameterisation
SUFFIX = "_full"
if (params$set == "4K") {
    # reduced dataset: keep only the first 4000 observations (see Background)
    df = df[1:4000, ]
    SUFFIX = "_4K"
}

2.1 Helper functions

Code for helper functions that conduct CV, bootstrap etc…

Code
# custom functions
#' Pseudo-logarithm transform
#'
#' Symmetric log-like transform based on the inverse hyperbolic sine;
#' well-defined for zero and negative values, approximately linear near
#' zero and logarithmic (to the given base) for large |x|.
#'
#' @param x Numeric vector to transform.
#' @param sigma Scaling parameter controlling the linear region around zero.
#' @param base Logarithm base.
#'
#' @return Transformed numeric vector, same length as `x`.
pseudo_log <- function(x, sigma = 1, base = 10) {
    scaled = x / (2 * sigma)
    asinh(scaled) / log(base)
}

#' Find the pseudo-log sigma that best normalizes a vector
#'
#' @param x Numeric vector.
#' @param base Logarithm base passed to `pseudo_log`.
#' @param sigmas Candidate sigma values (log2-spaced grid by default).
#'
#' @details
#' For each candidate sigma, the transformed and sorted data are correlated
#' with standard normal quantiles (QQ-plot correlation); the sigma giving the
#' highest correlation is selected.
#'
#' @return
#' A single optimal sigma (ties broken by the first maximum).
optimise_pseudo_log <- function(x, base = 10, sigmas = 2^seq(-10, 10, 1)) {
    # normal quantiles at plotting positions (i - 0.5) / n
    qnormx = qnorm((seq_along(x) - 0.5) / length(x))
    correlations = map_dbl(sigmas, ~ cor(qnormx, sort(pseudo_log(x, .x))))
    # which.max guarantees a scalar result; the previous subset on
    # `correlations == max(correlations)` could return several sigmas on
    # ties, which would silently recycle inside pseudo_log downstream
    sigmas[which.max(correlations)]
}

#' Combined density and boxplot for a single variable
#'
#' @param plotdf Data.frame containing the variable to plot.
#' @param v Name of the variable (string).
#'
#' @return
#' A patchwork object: density plot on top (80% height) above a jittered
#' boxplot (20% height) sharing the same x axis variable.
plot_density_and_box <- function(plotdf, v) {
    p_density = ggplot(plotdf, aes(x = .data[[v]])) + 
        geom_density() + 
        theme_bw() +
        # strip the x axis; it is shown by the boxplot panel below
        theme(axis.title.x = element_blank(), 
              axis.text.x = element_blank(), 
              axis.ticks.x = element_blank(),
              plot.margin = margin())
    
    p_box = ggplot(plotdf, aes(x = .data[[v]], y = 0)) + 
        # outliers hidden since all points are drawn by geom_jitter anyway
        geom_boxplot(outlier.shape = NA) +
        geom_jitter(height = 0.25, alpha = 0.1) +
        theme_bw() + 
        theme(axis.text.y = element_blank(), 
              axis.title.y = element_blank(),
              axis.ticks.y = element_blank(),
              panel.grid.minor.y = element_blank(), 
              panel.grid.major.y = element_blank(), 
              plot.margin = margin(0))
    
    wrap_plots(list(p_density, p_box), nrow = 2, heights = c(0.8, 0.2))
}

#' Extracts information for scaling input data
#' 
#' @param d Data.frame (or list of columns) to fit the scaler on.
#' 
#' @details 
#' Scaling refers to centering at zero and unit variance. Binary or factor 
#' variables are NOT scaled; a column is skipped when it is logical, a
#' factor, or has fewer than 10 distinct values.
#' 
#' @return 
#' Returns a list that can be used to scale data, fitted on the input data.
#' Each entry is `list(mean, sd, "scaled")` for scaled columns, or
#' `list(NA, NA, <reason>)` for columns left unscaled.
get_scale <- function(d) {
    map(d, function(x) {
        # sadly case_when doesnt work here, as it cannot mix strings 
        # and numeric values, and does not allow mixed length output
        if (is.logical(x)) {
            list(NA, NA, "Logical, not scaled")
        } else if (is.factor(x)) {
            list(NA, NA, "Factor, not scaled")
        } else if (n_distinct(x) < 10) {
            list(NA, NA, "Fewer than 10 distinct values, not scaled")
        } else {
            list(mean(x, na.rm = TRUE), sd(x, na.rm = TRUE), 
                 "scaled")
        }
    })
}

#' Applies extracted scale to data.frame
#' 
#' @param d
#' Data.frame
#' @param scaler
#' List of scalers as returned by `get_scale`
#' 
#' @details 
#' All columns not found in scaler are left as they are.
#' 
#' @return 
#' Scaled data.frame
apply_scale <- function(d, scaler) {
    d_scaled = d
    
    for (v in names(scaler)) {
        s = scaler[[v]]
        # an NA mean marks a column that was not scaled during fitting
        if (is.na(s[[1]])) next
        d_scaled[[v]] = apply_scale_variable(d[[v]], s)
    }
    
    d_scaled
}

#' Apply scale to individual vector
#' 
#' @param v
#' Vector of data.
#' @param scaler
#' Single scaler entry: `list(mean, sd, status)`.
#' 
#' @return 
#' Centered and sd-divided vector when the scaler status is "scaled",
#' otherwise the input unchanged.
apply_scale_variable <- function(v, scaler) {
    if (scaler[[3]] != "scaled") {
        return(v)
    }
    (v - scaler[[1]]) / scaler[[2]]
}

#' Extracts information for using pseudolog
#' 
#' @param d Data.frame (or list of columns) to fit on.
#' @param base Logarithm base passed to `optimise_pseudo_log`.
#' 
#' @details 
#' Binary or factor variables are NOT transformed; a column is skipped when
#' it is logical, a factor, or has fewer than 10 distinct values.
#' 
#' @return 
#' Returns a list that can be used to transform data, fitted on the input
#' data. NOTE: transformed entries are `list(sigma, "transformed")` (length
#' 2), skipped entries are `list(NA, NA, <reason>)` (length 3) -- consumers
#' check element [[2]] / [[1]] accordingly.
get_pseudolog <- function(d, base = 10) {
    map(d, function(x) {
        # sadly case_when doesnt work here, as it cannot mix strings 
        # and numeric values, and does not allow mixed length output
        if (is.logical(x)) {
            list(NA, NA, "Logical, not transformed")
        } else if (is.factor(x)) {
            list(NA, NA, "Factor, not transformed")
        } else if (n_distinct(x) < 10) {
            list(NA, NA, "Fewer than 10 distinct values, not transformed")
        } else {
            list(optimise_pseudo_log(x, base = base), 
                 "transformed")
        }
    })
}

#' Applies extracted pseudolog parameter to data.frame
#' 
#' @param d
#' Data.frame
#' @param transformer
#' List of transformers as returned by `get_pseudolog`
#' @param base
#' Logarithm base, forwarded to `apply_pseudolog_variable`
#' 
#' @details 
#' All columns not found in transformer are left as they are.
#' 
#' @return 
#' Transformed data.frame
apply_pseudolog <- function(d, transformer, base = 10) {
    d_trafo = d
    
    for (v in names(transformer)) {
        s = transformer[[v]]
        # an NA first element marks a column that was not transformed
        if (is.na(s[[1]])) next
        d_trafo[[v]] = apply_pseudolog_variable(d[[v]], s, base = base)
    }
    
    d_trafo
}

#' Apply transform to individual vector
#' 
#' @param v
#' Vector of data.
#' @param transformer
#' Single transformer entry: `list(sigma, "transformed")` or a length-3
#' "not transformed" entry.
#' @param base
#' Logarithm base for `pseudo_log`.
#' 
#' @return 
#' Pseudo-log transformed vector when the entry status is "transformed",
#' otherwise the input unchanged.
apply_pseudolog_variable <- function(v, transformer, base = 10) {
    if (transformer[[2]] != "transformed") {
        return(v)
    }
    pseudo_log(v, transformer[[1]], base = base)
}

#' Helper to create cross-validation fold indices
#' 
#' @param d Data.frame with the observations.
#' @param outcome Name (string) of the categorical stratification column.
#' @param n_folds Number of folds.
#' 
#' @details
#' Row indices are split by outcome level, shuffled, chopped into `n_folds`
#' chunks per level, and the chunks are merged fold-wise across levels so
#' each fold keeps approximately the overall outcome prevalence.
#' 
#' To ensure reproducibility, call `set.seed` or manage seed otherwise outside
#' this function call. 
#' 
#' @return
#' List of `n_folds` index vectors for cross-validation.
make_cv_stratified_folds <- function(d, outcome, n_folds) {
    # draw samples stratified by outcome (seq_len/seq_along are safe for
    # empty inputs, unlike 1:n)
    folds = by(seq_len(nrow(d)), d[[outcome]], function(f) {
        by(sample(f), cut(seq_along(f), n_folds, include.lowest = TRUE), identity)
    }) 
    # merge the per-level chunks fold-wise; pmap generalizes the previous
    # hard-coded two-level merge (folds[[1]], folds[[2]]) to outcomes with
    # any number of levels, with identical results for binary outcomes
    folds %>% as.list() %>% pmap(c) %>% unname()
}

#' Helper to compute performance metrics for logistic lasso model
#'
#' @param m Fitted `glmnet` object (logistic lasso over a lambda sequence).
#' @param x Model matrix of predictors matching the fit.
#' @param y Binary outcome vector used for evaluation.
#'
#' @details
#' Uses `rms::val.prob` to compute most prediction metrics. Predictions form
#' an n x n_lambda matrix, so metrics are computed per lambda (one column
#' each); lambdas where `val.prob` errors yield a row of NAs instead of
#' aborting the whole evaluation.
compute_performance_lasso <- function(m, x, y) {
    predict(m, newx = x, type = "response") %>% 
        # evaluate each lambda's prediction column separately
        apply(2, 
              function(p) tryCatch(
                  rms::val.prob(p, y, pl = FALSE)[
                  c("C (ROC)", "Brier", "Intercept", "Slope")
              ], 
              error = function(e) rep(NA, 4)
              )) %>%  
        # transpose so rows correspond to lambdas, columns to metrics
        t() %>% 
        as.data.frame() %>% 
        set_names(c("AUC", "Brier", "Intercept", "Slope")) %>% 
        mutate(
            lambda = m$lambda, 
            # model size at each lambda (count of nonzero coefficients)
            n_nonzero = colSums(as.matrix(m$beta) != 0)
        )
}

#' Helper function to extract maximal lambda for given number of coefficients
#' 
#' @param m Fitted `glmnet` object.
#' @param n Maximum number of nonzero coefficients allowed.
#' 
#' @details
#' Extracts the maximal lambda in the lambda sequence such that the number of 
#' nonzero coefficients is less or equal than n. If no lambda in the sequence
#' satisfies the constraint, the largest (most penalizing) lambda is returned
#' as a fallback.
get_lambda_ncoef <- function(m, n) {
    n_nonzero = colSums(as.matrix(m$beta) != 0)
    # restrict to lambdas honoring the "at most n coefficients" contract;
    # the previous which.min(abs(n_nonzero - n)) could select a model with
    # MORE than n coefficients, and its length(ind) > 1 check was dead code
    # (which.min always returns a single index)
    eligible = which(n_nonzero <= n)
    if (length(eligible) == 0) {
        # no feasible lambda: fall back to the strongest penalization
        return(max(m$lambda))
    }
    # among feasible lambdas pick the one closest to n coefficients; ties
    # resolve to the first (largest lambda, since glmnet's sequence is
    # decreasing)
    ind = eligible[which.min(n - n_nonzero[eligible])]
    m$lambda[ind]
}


#' Helper to fit adaptive logistic lasso model
#' 
#' @param d Data.frame containing outcome `BloodCulture` and all predictors.
#' @param f Model formula.
#' @param lambda Optional user-specified lambda sequence; `NULL` lets glmnet
#'   derive its own sequence.
#' @param gamma Exponent for the adaptive lasso penalty factors.
#' 
#' @return 
#' List with entries: 
#' 
#' - `m_glm`: the weight model object
#' - `pf`: penalty factors derived from `m_glm`
#' - `m_adalasso`: adaptive lasso glmnet object
#' - `perf`: apparent performance
#' - `scaler`: scaler object
fit_adaptive_lasso <- function(d, f, lambda = NULL, gamma = 0.5) {
    # standardize, in addition to SAP
    scaler = get_scale(d)
    d %<>% apply_scale(scaler) %>% 
        # recode SEX to 0/1 -- assumes SEX is coded 1/2 in the raw data
        # (NOTE(review): confirm coding against the data dictionary)
        mutate(SEX = SEX - 1)
    
    # computation of penalty factors according to SAP
    # (unpenalized ML fit; intercept dropped, weights ~ 1/|coef|^gamma)
    m_glm = glm(f, data = d, family = "binomial")
    pf = 1 / abs(coef(m_glm)[-1])^gamma
    
    # drop intercept column from the design matrix; glmnet adds its own
    x = model.matrix(f, d)[, -1]
    y = factor(d$BloodCulture)
    m_adalasso = glmnet(
        x, y, 
        # settings according to SAP
        family = "binomial", 
        penalty.factor = pf, 
        alpha = 1,
        # note standardization is false, as data was standardized before
        standardize = FALSE, 
        # user specified lambda (sequence)
        lambda = lambda, 
        # speed up by lowering number of lambdas to evaluate
        nlambda = 50, 
        lambda.min.ratio = 1e-8
    )
    
    # apparent (in-sample) performance across the whole lambda sequence
    perf = compute_performance_lasso(m_adalasso, x, d$BloodCulture)
    
    list(
        m_glm = m_glm, 
        pf = pf,
        m_adalasso = m_adalasso, 
        perf = perf, 
        scaler = scaler
    )
}


#' Helper to run cross-validation
#'
#' @param d Data.frame with all observations.
#' @param folds List of validation index vectors, one per fold.
#' @param f_m Model formula.
#' @param lambda Lambda sequence shared across all folds.
#' @param gamma Adaptive lasso exponent, forwarded to `fit_adaptive_lasso`.
#' @param seed Seed for the parallel futures.
#'
#' @details
#' Handle set up of futures outside of this function.
#'
#' @return
#' List with one entry per fold: a performance data.frame evaluated on the
#' held-out fold, or (due to `.errorhandling = "pass"`) the error condition
#' if that fold failed.
do_cv <- function(d, folds, f_m, lambda, gamma = 0.5, seed = 678) {
    foreach(
        i = seq_along(folds), 
        .errorhandling = "pass", 
        .options.future = list(seed = seed)
    ) %dofuture% {
        # validation = current fold, training = all remaining folds
        ind_v = folds[[i]]
        ind_t = unlist(folds[-i])
        d_t = d[ind_t, ]
        d_v = d[ind_v, ] 
        
        # use same lambda sequence for all folds, see default
        # alignment = "lambda" option in cv.glmnet
        res_t = fit_adaptive_lasso(d_t, f_m, lambda = lambda, gamma = gamma)
        
        # apply the training fold's scaler to the validation data;
        # SEX recode mirrors the one inside fit_adaptive_lasso
        d_v %<>% 
            apply_scale(res_t$scaler) %>% 
            mutate(SEX = SEX - 1)
        x_v = model.matrix(f_m, d_v)[, -1]
        # outcome column name is the first variable of the formula
        perf_v = compute_performance_lasso(res_t$m_adalasso, x_v, 
                                           d_v[[all.vars(f_m)[1]]])
        
        perf_v
    } 
}

#' Run single bootstrap sample
#' 
#' @param df Full data.frame to resample from.
#' @param b_ind Integer row indices of the bootstrap sample (drawn with
#'   replacement by the caller).
#' @param variables Character vector of predictor names.
#' @param imp_seed Seed passed to `futuremice` for imputation.
#' @param imp_m Number of imputations per sample.
#' @param imp_maxit Number of mice iterations.
#' 
#' @details
#' Handle seed and parallelisation outside this function.
#' 
#' @return
#' List with `p_Mi_Di_Dic` (out-of-bag performance per imputation, at the
#' CV-optimal lambda) and `p_Mi_Di_Di` (apparent performance per imputation).
do_bootstrap <- function(df, b_ind, variables,  
                         imp_seed, 
                         imp_m = 5, imp_maxit = 20) {
    res_out = list()
    
    # in-bag (training) vs. out-of-bag observations
    d_b = df[b_ind, ]
    d_oob = df[setdiff(1:nrow(df), b_ind), ]
    
    # impute for training part
    m_imp_b = d_b %>% 
        select(one_of(variables), BloodCulture) |>
        futuremice(
            m = imp_m, method = "pmm", 
            maxit = imp_maxit, 
            parallelseed = imp_seed, n.core = 1, future.plan = "sequential"
        )
    
    # impute out of bag samples
    # NOTE(review): the same parallelseed is reused for in-bag and oob
    # imputation -- presumably intentional for reproducibility; confirm
    m_imp_oob = d_oob %>% 
        select(one_of(variables), BloodCulture) |>
        futuremice(
            m = imp_m, method = "pmm", 
            maxit = imp_maxit, 
            parallelseed = imp_seed, n.core = 1, future.plan = "sequential"
        )
    
    # fit and evaluate model for all imputations
    f_m = glue("BloodCulture ~ {paste0(variables, collapse = '+')}") %>%
        as.formula()
    res_imp = list()
    for (i in 1:imp_m) {
        # fit model on the i-th completed (imputed) training dataset,
        # with pseudo-log transform fitted on the training data only
        d_b_imp = complete(m_imp_b, i) 
        transformer = get_pseudolog(d_b_imp)
        d_b_imp %<>% apply_pseudolog(transformer)
        res_out = fit_adaptive_lasso(d_b_imp, f_m)
        res_out$folds = make_cv_stratified_folds(d_b_imp, "BloodCulture", 5)
        res_out$seed_inner = sample.int(10e8, 1)
        res_out$res_cv = do_cv(d_b_imp, res_out$folds, f_m,
                               res_out$m_adalasso$lambda, seed = res_out$seed_inner)
        
        # obtain optimal lambda from cv
        # NOTE(review): if several lambdas tie for max AUC this yields a
        # vector and the == comparisons below recycle -- appears not to
        # occur in practice; confirm
        res_out$opt_lambda = bind_rows(res_out$res_cv) %>% 
            group_by(lambda) %>% 
            # average across folds
            summarise(across(everything(), mean)) %>% 
            filter(AUC == max(AUC, na.rm = TRUE)) %>% 
            pull(lambda)
        
        # store apparent performance at optimal lambda
        res_out$perf_opt = res_out$perf[res_out$perf$lambda == res_out$opt_lambda, ]
        
        # evaluate models on respective oob samples, applying the
        # training-fitted transformer and scaler (no refit on oob data)
        d_oob_imp = complete(m_imp_oob, i) %>% 
            apply_pseudolog(transformer) %>% 
            apply_scale(res_out$scaler) %>% 
            mutate(SEX = SEX - 1)
        x_oob = model.matrix(f_m, d_oob_imp)[, -1]
        perf_oob = compute_performance_lasso(res_out$m_adalasso, x_oob, 
                                           d_oob_imp[[all.vars(f_m)[1]]])
        res_out$perf_oob_opt = perf_oob[perf_oob$lambda == res_out$opt_lambda, ]
        
        res_imp[[i]] = res_out
    }
    
    # obtain p_Mi_Di_Dic by averaging over imputations
    # also store p_Mi_Di_Di for confidence intervals
    list(
        p_Mi_Di_Dic = map(res_imp, "perf_oob_opt") %>%
            bind_rows(.id = "imputation"),
        p_Mi_Di_Di = map(res_imp, "perf_opt") %>% 
            bind_rows(.id = "imputation")
    )
}

3 Variables

As outlined in the SAP, the set of variables for modeling is reduced according to collinearities and high variance inflation factors.

Code
VARIABLES = names(df) %>%
    setdiff(c(
        # exclude identifier and outcome
        "ID", "BloodCulture",
        # exclude ratio variables
        "MONOR", "LYMR", "NEUR", "EOSR", "BASOR", 
        # exclude high vif
        "MCV", "HCT"
    ))

glue("{length(VARIABLES)} Variables used for modeling according to SAP: 
     {VARIABLES |> paste0(collapse = ', ')}")
44 Variables used for modeling according to SAP: 
SEX, AGE, HGB, PLT, MCH, MCHC, RDW, MPV, LYM, MONO, EOS, BASO, NT, APTT, FIB, SODIUM, POTASS, CA, PHOS, MG, CREA, BUN, HS, GBIL, TP, ALB, AMY, PAMY, LIP, CHE, AP, ASAT, ALAT, GGT, LDH, CK, GLU, TRIG, CHOL, CRP, NEU, PDW, RBC, WBC
Code
# further changes according to diagnostics from imputation, see below
VARIABLES %<>% 
    setdiff(c(
        # exclude due to extreme correlation
        "NEU", "PDW"
    ))

glue("Final {length(VARIABLES)} Variables used for modeling: 
     {VARIABLES |> paste0(collapse = ', ')}")
Final 42 Variables used for modeling: 
SEX, AGE, HGB, PLT, MCH, MCHC, RDW, MPV, LYM, MONO, EOS, BASO, NT, APTT, FIB, SODIUM, POTASS, CA, PHOS, MG, CREA, BUN, HS, GBIL, TP, ALB, AMY, PAMY, LIP, CHE, AP, ASAT, ALAT, GGT, LDH, CK, GLU, TRIG, CHOL, CRP, RBC, WBC

4 Imputation

Imputation model is only re-fitted if no model file is found. Note that we used a parallel version of mice to speed up computations.

4.1 Clarifications / deviations from SAP

Based on initial runs and checking trace plots for diagnostics it was clear that the initial set of variables as outlined in the SAP did not work well with imputation due to difficult sampling from the Markov chains. The reason for this was the extremely high correlation between certain pairs of variables. In theory, one could just increase the number of iterations to obtain well-mixing chains. To keep the analysis feasible however, further variables were excluded from analysis to reduce autocorrelation of the chains:

  • NEU (neutrophiles) were extremely strongly correlated (0.97) with WBC (white blood cells), even though they represent only a subset of white blood cells. Both variables were affected by badly mixing chains, so NEU was removed.
  • PDW (platelet distribution width) and MPV (mean platelet volume) showed extremely high correlation (0.94), so PDW was removed.

Note that these choices were not made based on any association with the outcome, but purely for reasons of imputation and correlation. Increasing the number of iterations beyond 100 would have removed these issues as well, but that leads to very long runtimes for bootstrapping.

Further, the number of iterations in the mice algorithm was not mentioned in the SAP. Based on MCMC traceplot diagnostics it was increased to 20 from the default value to accommodate high autocorrelation in the chains (which is itself due to high correlations between groups of variables as seen in the correlation matrix). The value of 20 was conservative, even 15 might have been enough.

4.2 Fit / load

Code
file_model_imputation = file.path(params$folder_analysis, 
                                  sprintf("model_imputation%s.rds", SUFFIX))
if (!file.exists(file_model_imputation)) {
    glue("Fitting model and saving to {file_model_imputation}") %>% print()
    t = now()
    m_imp = df %>% 
        select(one_of(VARIABLES), BloodCulture) |>
        futuremice(
            # specified according to SAP
            m = 20, method = "pmm", 
            # based on diagnostics
            maxit = 20, 
            # implementation specific, for parallelisation 
            parallelseed = 35, n.core = 10, future.plan = "multisession"
        )
    dt = difftime(now(), t, units = "s") %>% as.numeric()
    
    list(model = m_imp, 
         runtime = dt) %>%
        saveRDS(file_model_imputation)    
} else {
    glue("Reading fitted model from {file_model_imputation}") %>% print()
    m_imp = readRDS(file_model_imputation)
    dt = m_imp$runtime
    m_imp = m_imp$model
}
Reading fitted model from Data/Processed/model_imputation_4K.rds

4.3 Check

Mixing of the chains looks good after increasing the number of iterations.

Code
glue("Fitting time: {dt} seconds")
Fitting time: 95.1408700942993 seconds
Code
print("Method used for imputation (empty means no missing data):")
[1] "Method used for imputation (empty means no missing data):"
Code
m_imp$method
         SEX          AGE          HGB          PLT          MCH         MCHC 
          ""           ""        "pmm"        "pmm"        "pmm"        "pmm" 
         RDW          MPV          LYM         MONO          EOS         BASO 
       "pmm"        "pmm"        "pmm"        "pmm"        "pmm"        "pmm" 
          NT         APTT          FIB       SODIUM       POTASS           CA 
       "pmm"        "pmm"        "pmm"        "pmm"        "pmm"        "pmm" 
        PHOS           MG         CREA          BUN           HS         GBIL 
       "pmm"        "pmm"        "pmm"        "pmm"        "pmm"        "pmm" 
          TP          ALB          AMY         PAMY          LIP          CHE 
       "pmm"        "pmm"        "pmm"        "pmm"        "pmm"        "pmm" 
          AP         ASAT         ALAT          GGT          LDH           CK 
       "pmm"        "pmm"        "pmm"        "pmm"        "pmm"        "pmm" 
         GLU         TRIG         CHOL          CRP          RBC          WBC 
       "pmm"        "pmm"        "pmm"        "pmm"        "pmm"        "pmm" 
BloodCulture 
          "" 
Code
plot(m_imp)

5 Example model fit in one imputation

Predictor distributions before and after optimized pseudolog transformation.

Code
d_imp = complete(m_imp, 1)

# visualise original distributions
d_imp %>% 
    select(one_of(VARIABLES), -SEX) %>% 
    pivot_longer(everything()) %>% 
    ggplot(aes(x = value)) + 
    geom_density() +
    facet_wrap(~ name, ncol = 5, scales = "free") + 
    theme_bw()

Code
d_imp %<>%
    # symmetrize according to SAP
    mutate(across(-c(SEX, AGE, BloodCulture), 
                  ~ pseudo_log(.x, optimise_pseudo_log(.x))))

# visualise all transformed marker distributions
# they all should be somewhat symmetric after the pseudo-log transformation
d_imp %>% 
    select(one_of(VARIABLES), -SEX) %>% 
    pivot_longer(everything()) %>% 
    ggplot(aes(x = value)) + 
    geom_density() +
    facet_wrap(~ name, ncol = 5, scales = "free") + 
    theme_bw()

Fitting adaptive Lasso.

Code
f_m = glue("BloodCulture ~ {paste0(VARIABLES, collapse = '+')}") %>% 
    as.formula()

res = fit_adaptive_lasso(d_imp, f_m)

Check calibration and performance of weight model.

Code
rms::val.prob(predict(res$m_glm, type = "response"), d_imp$BloodCulture)

          Dxy       C (ROC)            R2             D      D:Chi-sq 
 5.840844e-01  7.920422e-01  2.042029e-01  9.382351e-02  3.762941e+02 
          D:p             U      U:Chi-sq           U:p             Q 
           NA -5.000000e-04 -2.501110e-12  1.000000e+00  9.432351e-02 
        Brier     Intercept         Slope          Emax           E90 
 6.804033e-02 -1.789453e-11  1.000000e+00  2.754732e-02  8.831099e-03 
         Eavg           S:z           S:p 
 4.180723e-03  1.210943e-01  9.036164e-01 

Weight distribution (i.e. penalty factors).

Code
plot_density_and_box(data.frame(penalty_factor = res$pf), "penalty_factor")

Adaptive lasso coefficient paths (shows difficulty to obtain simple set of covariates).

Code
plot(res$m_adalasso, xvar = "lambda")

5.1 CV

Doing 5-fold CV.

Code
set.seed(15)
folds = make_cv_stratified_folds(d_imp, "BloodCulture", 5)
glue("Outcome prevalence in CV folds:")
Outcome prevalence in CV folds:
Code
map_dbl(folds, ~ d_imp$BloodCulture[.x] %>% mean())
[1] 0.08489388 0.08385482 0.08500000 0.08385482 0.08489388
Code
plan(multisession, workers = 5)
res_cv = do_cv(d_imp, folds, f_m, res$m_adalasso$lambda, seed = 1868)
plan(sequential)

5.2 Results

Performance metrics per cross-validation fold (out of fold data).

Code
plot_cv = bind_rows(res_cv, .id = "fold")

# AUC vs. log(lambda) per fold; black line = mean across folds
# (`fun` replaces `fun.y`, which is deprecated since ggplot2 3.3.0)
ggplot(plot_cv, aes(x = log(lambda), y = AUC, color = fold)) + 
    geom_point() + 
    stat_summary(fun = mean, color = "black", geom = "line") +
    theme_bw() + 
    theme(legend.position = "top")

Code
ggplot(plot_cv, aes(x = n_nonzero, y = AUC, color = fold)) + 
    geom_point() + 
    geom_smooth(aes(x = n_nonzero, y = AUC), inherit.aes = FALSE, color = "black", 
                se = FALSE) +
    theme_bw() + 
    theme(legend.position = "top")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Code
# Brier score vs. log(lambda) per fold; black line = mean across folds
# (`fun` replaces `fun.y`, which is deprecated since ggplot2 3.3.0)
ggplot(plot_cv, aes(x = log(lambda), y = Brier, color = fold)) + 
    geom_point() + 
    stat_summary(fun = mean, color = "black", geom = "line") +
    theme_bw() + 
    theme(legend.position = "top")

Code
# Calibration slope vs. log(lambda) per fold; y axis clipped to [0, 10]
# (`fun` replaces `fun.y`, which is deprecated since ggplot2 3.3.0)
ggplot(plot_cv, aes(x = log(lambda), y = Slope, color = fold)) + 
    geom_point() + 
    stat_summary(fun = mean, color = "black", geom = "line") +
    coord_cartesian(ylim = c(0, 10)) +
    theme_bw() + 
    theme(legend.position = "top")

Code
ggplot(plot_cv, aes(x = n_nonzero, y = Slope, color = fold)) + 
    geom_point() + 
    geom_smooth(aes(x = n_nonzero, y = Slope), inherit.aes = FALSE, color = "black", 
                se = FALSE) +
    theme_bw() + 
    theme(legend.position = "top")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Code
perf_cv = plot_cv %>% group_by(lambda) %>% 
    summarise(
        across(c(AUC, Brier, Slope, Intercept), 
               ~ mean(.x, na.rm = TRUE))
    )
ind_opt_lambda = which.max(perf_cv$AUC)
opt_lambda = perf_cv$lambda[ind_opt_lambda]
glue("Best performance at lambda {opt_lambda} using {sum(coef(res$m_adalasso, s = opt_lambda) != 0) - 1} variables:")
Best performance at lambda 0.00570402619302209 using 19 variables:
Code
glue("Apparent AUC {res$perf$AUC[ind_opt_lambda]} / CV AUC {perf_cv$AUC[ind_opt_lambda]}
    Apparent Brier score {res$perf$Brier[ind_opt_lambda]} / CV Brier score {perf_cv$Brier[ind_opt_lambda]}
    Apparent calibration intercept {res$perf$Intercept[ind_opt_lambda]} / CV calibration intercept {perf_cv$Intercept[ind_opt_lambda]}
    Apparent calibration slope {res$perf$Slope[ind_opt_lambda]} / CV calibration slope {perf_cv$Slope[ind_opt_lambda]}")
Apparent AUC 0.791580893164727 / CV AUC 0.767500246924879
Apparent Brier score 0.0681213905915638 / CV Brier score 0.0702532038702263
Apparent calibration intercept 0.0578695359124389 / CV calibration intercept 0.118695373067437
Apparent calibration slope 1.02937513341044 / CV calibration slope 1.05869245801805
Code
coef_m = coef(res$m_adalasso, s = opt_lambda)
data.frame(dimnames(coef_m)[1], as.numeric(coef_m)) %>% 
    set_names(c("Variable", "Coefficient")) %>%
    arrange(desc(abs(Coefficient)))
Variable Coefficient
(Intercept) -2.7258746
AP 0.3826980
MONO -0.3717263
WBC 0.3450730
EOS -0.3095569
CHOL -0.2708813
LYM -0.2699901
MG -0.2208895
BUN 0.1934051
LDH -0.1808838
SODIUM -0.1800621
CK 0.1645244
AGE 0.1584823
ALAT 0.0921322
PHOS -0.0915088
CREA 0.0751253
APTT -0.0533024
MCH 0.0275662
RBC -0.0089462
SEX 0.0075623
HGB 0.0000000
PLT 0.0000000
MCHC 0.0000000
RDW 0.0000000
MPV 0.0000000
BASO 0.0000000
NT 0.0000000
FIB 0.0000000
POTASS 0.0000000
CA 0.0000000
HS 0.0000000
GBIL 0.0000000
TP 0.0000000
ALB 0.0000000
AMY 0.0000000
PAMY 0.0000000
LIP 0.0000000
CHE 0.0000000
ASAT 0.0000000
GGT 0.0000000
GLU 0.0000000
TRIG 0.0000000
CRP 0.0000000

6 Main model fit on all imputations

According to SAP the main model is fitted on the 20 imputed datasets and the apparent performance is computed. For this we simply repeat the steps done for a single imputation.

Code
f_m = glue("BloodCulture ~ {paste0(VARIABLES, collapse = '+')}") %>% 
    as.formula()
plan(multisession, workers = 10)

res_imp = foreach(
        i = 1:m_imp$m, 
        .errorhandling = "pass", 
        .options.future = list(seed = 156, 
                               packages = c("mice", "glmnet"))
) %dofuture% {
    t_start = Sys.time()
    
    # res_out = list()
    # res_out$packages = loadedNamespaces()
    d_imp = complete(m_imp, i) %>%
        mutate(across(-c(SEX, AGE, BloodCulture),
                      ~ pseudo_log(.x, optimise_pseudo_log(.x))))
    res_out = fit_adaptive_lasso(d_imp, f_m)
    res_out$folds = make_cv_stratified_folds(d_imp, "BloodCulture", 5)
    res_out$seed_inner = sample.int(10e8, 1)
    res_out$res_cv = do_cv(d_imp, res_out$folds, f_m,
                           res_out$m_adalasso$lambda, seed = res_out$seed_inner)
    res_out$time = difftime(Sys.time(), t_start, units = "s") %>% 
        as.numeric()

    res_out
}

plan(sequential)

6.1 Results

Performance metrics over all imputations.

Code
glue("Mean runtime per imputation {map_dbl(res_imp, 'time') %>% mean()} seconds.")
Mean runtime per imputation 6.34247155189514 seconds.
Code
perf_cv_imp = map(res_imp, ~ .x$res_cv %>% bind_rows(.id = "fold")) %>% 
    bind_rows(.id = "imputation") %>% 
    mutate(imputation = fct_inseq(imputation), 
           fold = fct_inseq(fold))

perf_cv_imp %>% 
    select(-n_nonzero) %>% 
    group_by(imputation, lambda) %>% 
    select(-fold) %>% 
    summarise(across(everything(), mean)) %>% 
    pivot_longer(-c(imputation, lambda)) %>% 
    filter(value < 10) %>% 
    ggplot(aes(x = log(lambda), y = value, color = imputation)) + 
    geom_line(alpha = 0.5) + 
    facet_wrap(~ name, ncol = 2, scales = "free_y") + 
    theme_bw()
`summarise()` has grouped output by 'imputation'. You can override using the
`.groups` argument.

Performance metrics across imputations and along different number of coefficients.

Code
perf_cv_imp %>% 
    select(-lambda) %>% 
    group_by(imputation, n_nonzero) %>% 
    select(-fold) %>% 
    summarise(across(everything(), mean)) %>% 
    pivot_longer(-c(imputation, n_nonzero)) %>% 
    filter(value < 10) %>% 
    ggplot(aes(x = n_nonzero, y = value, color = imputation)) + 
    geom_line(alpha = 0.5) + 
    facet_wrap(~ name, ncol = 2, scales = "free_y") + 
    theme_bw()
`summarise()` has grouped output by 'imputation'. You can override using the
`.groups` argument.

Obtain optimal lambda in each imputation and report metrics over imputations and coefficients.

Code
perf_cv_imp_opt = perf_cv_imp %>% 
    group_by(imputation, lambda) %>% 
    select(-fold) %>% 
    summarise(across(everything(), mean)) %>% 
    group_by(imputation) %>% 
    # na.rm as sometimes with extreme penalization there are NA values
    filter(AUC == max(AUC, na.rm = TRUE)) %>% 
    arrange(imputation)
`summarise()` has grouped output by 'imputation'. You can override using the
`.groups` argument.
Code
perf_cv_imp_opt %>% 
    pivot_longer(-c(imputation, lambda)) %>% 
    ggplot(aes(x = value)) + 
    geom_density() + 
    geom_rug() +
    facet_wrap(~ name, ncol = 2, scales = "free") +
    theme_bw()

Number of non-zero coefficients across imputations.

Code
coef_imp = map(res_imp, "m_adalasso") %>% 
    imap(~ coef(.x, s = perf_cv_imp_opt$lambda[.y])) %>% 
    map(~ as.matrix(.x) %>% as.data.frame()) %>% 
    map(~ rownames_to_column(.x, "variable")) %>% 
    bind_rows(.id = "imputation") %>% 
    rename(coefficient = "s1") %>% 
    mutate(imputation = fct_inseq(imputation))

coef_imp %>% 
    group_by(imputation) %>% 
    summarise(nnz = sum(coefficient != 0)) %>% 
    pull(nnz) %>% 
    summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  21.00   24.75   28.00   29.30   32.25   43.00 
Code
coef_imp %>% 
    filter(variable != "(Intercept)") %>% 
    ggplot(aes(x = imputation, y = coefficient, group = variable, 
               color = variable)) + 
    geom_hline(yintercept = 0, linetype = "dashed") +
    geom_point() + 
    geom_line() + 
    facet_wrap(~ variable, ncol = 5) +
    scale_color_viridis_d() +
    theme_bw() + 
    theme(legend.position = "none")

Number of imputations a variable was chosen in.

Code
coef_imp %>% 
    group_by(variable) %>% 
    summarise(n_imp = sum(coefficient != 0)) %>% 
    pull(n_imp) %>% 
    table()
.
 3  4  5  6  7  8  9 10 11 12 13 15 16 18 19 20 
 2  2  3  3  2  1  1  2  2  1  1  2  1  1  3 16 
Code
coef_imp %>% 
    group_by(imputation) %>%
    arrange(desc(abs(coefficient)), .by_group = TRUE) %>% 
    slice_head(n = 10) %>% 
    pull(variable) %>% 
    table()
.
(Intercept)         AMY          AP        ASAT         BUN        CHOL 
         20           7          20          13          12          15 
       CREA         EOS         HGB         LDH         LYM         MCH 
          3          19           4          16          20           2 
         MG        MONO         RBC         SEX      SODIUM         WBC 
          3          20           4           1           1          20 

Obtain lambda for at most 10 nonzero coefficients per imputation and report metrics.

Code
perf_cv_imp_10 = perf_cv_imp %>% 
    group_by(imputation, fold) %>%
    mutate(diff = 10 - n_nonzero) %>% 
    # chose lambda closest to 10 variables per fold, but not more than 10
    # if more than one choice then take larger lambda
    filter(n_nonzero <= 10) %>% 
    filter(diff == min(diff)) %>% 
    filter(lambda == max(lambda)) %>% 
    ungroup() %>% 
    select(-fold) %>% 
    group_by(imputation) %>% 
    summarise(across(everything(), mean)) %>% 
    arrange(imputation) %>% 
    select(-diff)

perf_cv_imp_10 %>% 
    pivot_longer(-c(imputation, lambda)) %>% 
    ggplot(aes(x = value)) + 
    geom_density() + 
    geom_rug() +
    facet_wrap(~ name, ncol = 2, scales = "free") +
    theme_bw()

Final coefficients by averaging over imputations. Do this for optimal lambda and for lambdas with at most 10 variables.

Code
# For each imputation, extract the adaptive-lasso coefficients at that
# imputation's "at most 10 variables" lambda selected above.
coef_imp_10 = map(res_imp, "m_adalasso") %>% 
    imap(~ coef(.x, s = perf_cv_imp_10$lambda[.y])) %>% 
    map(~ as.matrix(.x) %>% as.data.frame()) %>% 
    map(~ rownames_to_column(.x, "variable")) %>% 
    bind_rows(.id = "imputation") %>% 
    rename(coefficient_10 = "s1") %>% 
    mutate(imputation = fct_inseq(imputation))

# Pool coefficients across imputations (mean and median), for both the
# CV-optimal lambda (coef_imp) and the at-most-10-variables lambda.
# NOTE(review): bind_cols relies on coef_imp and coef_imp_10 sharing the
# exact same row order (imputation x variable) - presumably both derive
# from res_imp in the same order; confirm upstream.
coef_final = coef_imp %>% 
    bind_cols(coef_imp_10 %>% select(coefficient_10)) %>% 
    group_by(variable) %>% 
    summarise(coefficient_mean = mean(coefficient), 
              coefficient_10_mean = mean(coefficient_10), 
              coefficient_median = median(coefficient), 
              coefficient_10_median = median(coefficient_10))

# Display pooled coefficients sorted by absolute mean effect size.
coef_final %>% 
    arrange(desc(abs(coefficient_mean))) %>% 
    data.frame()
variable coefficient_mean coefficient_10_mean coefficient_median coefficient_10_median
(Intercept) -2.8571926 -2.4891531 -2.8484012 -2.4862197
WBC 0.4618299 0.0958697 0.4488327 0.0915915
MONO -0.4497013 -0.1214846 -0.4604149 -0.1050824
AP 0.3459738 0.2340095 0.3562799 0.2462519
LYM -0.3102279 -0.2589009 -0.3106987 -0.2645914
EOS -0.2951762 -0.1717925 -0.2945936 -0.1735760
CHOL -0.2711473 -0.0733954 -0.2712389 -0.0642798
LDH -0.2618009 -0.0020509 -0.2649717 0.0000000
ASAT 0.2137613 0.0123157 0.2354568 0.0000000
BUN 0.2005082 0.0969528 0.1997746 0.1016956
AMY -0.1906833 -0.0096999 -0.1840483 0.0000000
MG -0.1779993 -0.0328142 -0.1702042 -0.0147155
SODIUM -0.1716689 -0.0350216 -0.1764753 -0.0302401
CREA 0.1610721 0.0082338 0.1610068 0.0000000
AGE 0.1572950 0.0123930 0.1562167 0.0026499
PHOS -0.1410352 -0.0009506 -0.1372019 0.0000000
SEX 0.1255332 0.0000000 0.1092733 0.0000000
HGB -0.1169661 0.0000000 -0.0270699 0.0000000
APTT -0.0993126 0.0000000 -0.1111690 0.0000000
RBC 0.0876142 0.0000000 0.0000000 0.0000000
MCH 0.0839192 0.0000442 0.0402197 0.0000000
CK 0.0821102 0.0000000 0.0768145 0.0000000
RDW -0.0750027 0.0000000 -0.0640553 0.0000000
ALAT 0.0480337 0.0029510 0.0079780 0.0000000
PLT -0.0379212 0.0000000 -0.0156573 0.0000000
PAMY 0.0336205 0.0000000 0.0000000 0.0000000
NT 0.0323688 0.0000000 0.0147046 0.0000000
GGT 0.0290822 0.0000000 0.0127746 0.0000000
BASO -0.0247724 0.0000000 -0.0169627 0.0000000
MCHC -0.0238719 0.0000000 0.0000000 0.0000000
CA 0.0220024 0.0000000 0.0001072 0.0000000
LIP 0.0206742 0.0000000 0.0000000 0.0000000
CHE -0.0186180 0.0000000 0.0000000 0.0000000
MPV -0.0175125 0.0000000 0.0000000 0.0000000
CRP 0.0160889 0.0000000 0.0000000 0.0000000
POTASS 0.0148160 0.0000000 0.0000000 0.0000000
HS -0.0120766 0.0000000 0.0000000 0.0000000
TP 0.0071132 0.0000000 0.0000000 0.0000000
GBIL 0.0064162 0.0000000 0.0000000 0.0000000
FIB -0.0054983 0.0000000 0.0000000 0.0000000
TRIG 0.0034750 0.0000000 0.0000000 0.0000000
ALB 0.0018178 0.0000000 0.0000000 0.0000000
GLU 0.0013576 0.0000000 0.0000000 0.0000000

7 Model evaluation using Bootstrap

7.1 Apparent performance

Based on runtimes for the imputation we reduce the number of imputations per bootstrap to two, and reduce the number of bootstraps to 500. This is according to the SAP.

Code
# apparent performance estimates
# obtain tuned lambda and average imputation performance
perf_imp = imap(res_imp, 
                    ~ .x$perf %>% filter(lambda == perf_cv_imp_opt$lambda[.y])) %>% 
    bind_rows(.id = "imputation") %>% 
    select(-n_nonzero)

# p_M_DD: apparent (train-on-D, evaluate-on-D) performance, averaged over
# imputations; used as one ingredient of the 0.632+ estimator below.
p_M_DD = perf_imp %>% 
    select(-c(imputation, lambda)) %>% 
    summarise(across(everything(), mean))

p_M_DD
AUC Brier Intercept Slope
0.7874327 0.0679403 0.202409 1.100855

7.2 Bootstrap indices

Code
# Draw b bootstrap resamples of row indices (with replacement); the seed
# makes the resamples reproducible across renders.
set.seed(13376)
b = 500
bootstraps = map(1:b, ~ sample(1:nrow(df), nrow(df), replace = TRUE))

glue("Statistics of relative frequency of unique samples per bootstrap:")
Statistics of relative frequency of unique samples per bootstrap:
Code
# Fraction of distinct observations per bootstrap resample
# (expected around 1 - 1/e = 0.632 for large n).
(map_dbl(bootstraps, n_distinct) / nrow(df)) %>% summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.6180  0.6290  0.6324  0.6323  0.6358  0.6500 

7.3 Run bootstrap

Code
# Cache the expensive bootstrap results on disk; rerun only if the cache
# file is missing.
file_bootstrap = file.path(params$folder_analysis, 
                           sprintf("bootstrap%s.rds", SUFFIX))
if (!file.exists(file_bootstrap)) {
    glue("Running bootstraps and saving to {file_bootstrap}") %>% print()
    
    # Parallelise over bootstrap resamples with 10 worker sessions;
    # .options.future seed gives parallel-safe reproducible RNG streams.
    plan(multisession, workers = 10)
    res_b = foreach(
        i = seq_along(bootstraps), 
        .errorhandling = "pass", 
        .options.future = list(seed = 15)
    ) %dofuture%  {
        t_start = Sys.time()
        # NOTE(review): do_bootstrap is defined in the helper section (not
        # shown here); per the text it imputes (m = 2, maxit = 20) and
        # refits the model on the resample.
        res = do_bootstrap(df, b_ind = bootstraps[[i]], variables = VARIABLES, 
                           imp_seed = sample.int(10e8, 1), imp_m = 2, imp_maxit = 20)
        res$runtime = difftime(now(), t_start, units = "s") %>% as.numeric()
        res
    }
    plan(sequential)
    saveRDS(res_b, file_bootstrap)    
} else {
    glue("Reading bootstrap results from {file_bootstrap}") %>% print()
    res_b = readRDS(file_bootstrap)
}
Reading bootstrap results from Data/Processed/bootstrap_4K.rds
Code
# Per-bootstrap runtimes were recorded in seconds; convert to hours.
glue("Runtime per bootstrap in hours:")
Runtime per bootstrap in hours:
Code
summary(map_dbl(res_b, 'runtime')) / 3600
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.02302 0.03367 0.03420 0.03418 0.03472 0.03879 

\(p_M^{0.632+}\) estimate of performance.

Code
# first compute p_Mi_Di_Dic in each bootstrap
# (performance of the model fitted on bootstrap sample Di, evaluated on the
# out-of-bootstrap observations Dic), averaged over imputations
p_Mi_Di_Dic = res_b %>% map_df("p_Mi_Di_Dic", .id = "bootstrap") %>% 
    group_by(bootstrap) %>% 
    select(-imputation, -lambda) %>% 
    summarise(across(everything(), mean))

# then average over bootstraps -> out-of-sample performance estimate
p_M_BS = p_Mi_Di_Dic %>%
    select(-bootstrap) %>% 
    colMeans()

p_M_BS
        AUC       Brier   Intercept       Slope 
 0.74192015  0.07181641 -0.43679031  0.79033707 
Code
# define weights - but exclude brier score in this step
# define no-effect weights for intercept and slope by computing
# m0 = glm(BloodCulture ~ 1, data = d_imp, family = "binomial")
# rms::val.prob(predict(m0, type = "response"), d_imp$BloodCulture)
# p_M_0: no-information reference (AUC = 0.5, intercept from empty model,
# slope = 0); index [-2] drops the Brier score throughout this step.
p_M_0 = c(0.5, -2.382719, 0)
# 0.632+ estimator: relative overfitting R, weight w, then the weighted
# combination of apparent (p_M_DD) and bootstrap (p_M_BS) performance.
R = (p_M_BS[-2] - p_M_DD[-2]) / (p_M_0 - p_M_DD[-2])
w = 0.632 / (1 - 0.368 * R)
p_M_0.632 = (1 - w) * p_M_DD[-2] + w * p_M_BS[-2]

p_M_0.632
AUC Intercept Slope
0.756889 -0.2420028 0.8818774

Compute quantiles for confidence intervals and show distribution of statistics from which they are derived.

Code
# p_Mi_Di_Di: apparent performance of each bootstrap model evaluated on its
# own bootstrap sample, averaged over imputations within each bootstrap.
p_Mi_Di_Di = res_b %>% map_df("p_Mi_Di_Di", .id = "bootstrap") %>% 
    group_by(bootstrap) %>% 
    select(-imputation, -lambda) %>% 
    summarise(across(everything(), mean)) %>% 
    select(-bootstrap)

# Per-bootstrap deviations from the overall apparent performance; their
# quantiles feed the confidence intervals below (Brier score excluded).
wi = p_Mi_Di_Di %>% apply(1, function(row) row - p_M_DD) %>% 
    bind_rows() %>% 
    select(-Brier)

# Visual check of the deviation distributions per metric.
wi %>% 
    pivot_longer(everything()) %>% 
    ggplot(aes(x = value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~ name, ncol = 3, scales = "free") + 
    theme_bw()

Confidence intervals.

Code
# compute quantiles from distribution
xi = apply(wi, 2, quantile, probs = c(0.025, 0.975))

glue("Bootstrap evaluation including confidence intervals: ")
Bootstrap evaluation including confidence intervals: 
Code
# CI: lower = estimate - |2.5% quantile|, upper = estimate + 97.5% quantile
# (the absolute value is a pragmatic choice, see Section 8).
glue("AUC: {p_M_0.632$AUC} ({p_M_0.632$AUC - abs(xi[1, 'AUC'])}, {p_M_0.632$AUC + xi[2, 'AUC']})")
AUC: 0.756889011156315 (0.74582551996003, 0.796370231701684)
Code
# Fix: the lower bound used xi[2, 'Intercept'] (97.5% quantile) instead of
# xi[1, 'Intercept'] (2.5% quantile), inconsistent with the AUC and slope
# CIs, and yielding an artificially symmetric interval. The printed
# interval below was rendered with the old code and will change on
# re-render.
glue("Calibration intercept: {p_M_0.632$Intercept} ({p_M_0.632$Intercept - abs(xi[1, 'Intercept'])}, {p_M_0.632$Intercept + xi[2, 'Intercept']})")
Calibration intercept: -0.242002774902798 (-0.377948533325911, -0.106057016479686)
Code
# CI for the calibration slope, same construction as for the AUC above.
glue("Calibration slope: {p_M_0.632$Slope} ({p_M_0.632$Slope - abs(xi[1, 'Slope'])}, {p_M_0.632$Slope + xi[2, 'Slope']})")
Calibration slope: 0.881877365723491 (0.785170058196541, 0.946163152569005)

8 Clarifications / deviations from SAP

  • Two more variables were removed due to problems with the imputation model (not anticipated in SAP, see above for details).
  • Iterations in imputation model increased based on diagnostics (not mentioned in SAP).
  • Data were standardised for all model fitting procedures (after imputation, not mentioned in SAP, but common for penalized models).
  • CV was stratified to keep similar outcome prevalence (not specified in SAP, done due to larger differences between folds).
  • Alignment of lambda sequences in the CV was based on model trained on full data (this was not explicitly specified in the protocol, but is the default in cv.glmnet so can be considered a “default” way to handle this).
  • Creating folds indices for CV in different imputations was now done independently (but you might choose the same indices for all imputations, this was not specified in the SAP).
  • There is a typo in Section 6, in the definition of \(\hat{p}_M^{D, D}\) (an index \(j\) is missing in the summation).
  • The construction of the 95% CIs is actually unclear, but also in the paper. Due to negative values I introduced an absolute value, but I’m not sure if that is correct.
  • The definition of the 0.632+ estimate requires non-effect weights for calibration slope and intercept. These were now estimated by an empty logistic model mimicking no association of the variables with the outcome.
  • The model with at most 10 variables is quite bad in terms of calibration performance (i.e. it overshrinks leading to high calibration slope in the validation set). Also discrimination is lower than for the optimal model. At the moment we do not evaluate this model in the bootstrap.

9 Further notes

  • A bug in the optimisation of the pseudo-log transformation was found (and fixed) when looking at the coefficient paths of the lasso model.
  • Even after reduction of dataset to 4000 observations (epv < 10) the number of imputations actually leads to a model that is not sparse. Choosing the median instead of the mean would lead to selection.
  • (Superseded by 29.8.2024) Using the full dataset, the model is not sparse, the lasso cannot remove a relevant number of variables as the CV performance is very flat. Arguably, the dataset is too large and allows to fit an essentially saturated model with all variables. Restricting the number of observations leads to more typical results, i.e. overfitting with all variables and a relevant selection by the lasso.

10 Conclusions

Overall the results from the analysis as outlined in the SAP are not that great. While discrimination is close to the published model (both full and 4K data), the calibration is not excellent during bootstrap validation. That is likely because the separate imputation of out-of-bootstrap data leads to slight differences in the distribution and miscalibrated predictions. This effect is smaller for the full dataset, indicating that the 4K dataset is “at the limit” for this kind of approach to work well.

Furthermore, the final model using all imputed datasets, despite using adaptive Lasso, is not sparse anymore due to many variables being taken up with small coefficients as they occur a handful of times in the imputed datasets. This is a big weakness of the approach of using the mean coefficient from imputations (although it is close to a “Bayesian” way to handle things, where sparsity also has to be enforced and is not a direct product of shrinkage posteriors). However, alternatives are not so clear and would require “breaking” the principles of multiple imputation a bit by sharing information across imputations (e.g. by requiring a variable to be selected in at least 50% of imputations). Such an approach could be added here, however, to enforce more sparsity. Alternatively, one could drop in each imputation the variables with small coefficients and only then combine the imputations (this would be feasible for each imputation independently and thus fits to multiple imputation).

11 Sessioninfo

To build this notebook:

  • install quarto
  • make sure the data is in the correct folder (either in global Data folder, or as subfolder of the folder containing this notebook).
  • build using the make.R script (global Data), or using the Render button in RStudio (subfolder Data).
Code
# Notebook parameters this render was produced with.
params
$folder_analysis
[1] "Data/Processed"

$set
[1] "4K"
Code
# Total notebook runtime; units = 'm' partially matches "mins", so the
# division by 60 converts minutes to hours.
glue("Runtime {difftime(Sys.time(), T_START, units = 'm') / 60} hours.")
Runtime 0.0159091177913878 hours.
Code
# Record package versions and platform details for reproducibility.
sessioninfo::session_info()
- Session info ---------------------------------------------------------------
 setting  value
 version  R version 4.4.2 (2024-10-31 ucrt)
 os       Windows 10 x64 (build 17763)
 system   x86_64, mingw32
 ui       RTerm
 language (EN)
 collate  German_Austria.1252
 ctype    German_Austria.1252
 tz       Europe/Vienna
 date     2025-04-22
 pandoc   3.2 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)

- Packages -------------------------------------------------------------------
 package      * version date (UTC) lib source
 backports      1.5.0   2024-05-23 [1] CRAN (R 4.4.0)
 base64enc      0.1-3   2015-07-28 [1] CRAN (R 4.4.0)
 boot           1.3-31  2024-08-28 [2] CRAN (R 4.4.2)
 broom          1.0.6   2024-05-17 [1] CRAN (R 4.4.0)
 checkmate      2.3.1   2023-12-04 [1] CRAN (R 4.4.0)
 cli            3.6.2   2023-12-11 [1] CRAN (R 4.4.0)
 cluster        2.1.6   2023-12-01 [2] CRAN (R 4.4.2)
 codetools      0.2-20  2024-03-31 [2] CRAN (R 4.4.2)
 colorspace     2.1-0   2023-01-23 [1] CRAN (R 4.4.0)
 crayon         1.5.2   2022-09-29 [1] CRAN (R 4.4.0)
 data.table     1.15.4  2024-03-30 [1] CRAN (R 4.4.0)
 digest         0.6.35  2024-03-11 [1] CRAN (R 4.4.0)
 doFuture     * 1.0.1   2023-12-20 [1] CRAN (R 4.4.1)
 dplyr        * 1.1.4   2023-11-17 [1] CRAN (R 4.4.0)
 evaluate       0.23    2023-11-01 [1] CRAN (R 4.4.0)
 fansi          1.0.6   2023-12-08 [1] CRAN (R 4.4.0)
 farver         2.1.2   2024-05-13 [1] CRAN (R 4.4.0)
 fastmap        1.2.0   2024-05-15 [1] CRAN (R 4.4.0)
 forcats      * 1.0.0   2023-01-29 [1] CRAN (R 4.4.0)
 foreach      * 1.5.2   2022-02-02 [1] CRAN (R 4.4.0)
 foreign        0.8-87  2024-06-26 [2] CRAN (R 4.4.2)
 Formula        1.2-5   2023-02-24 [1] CRAN (R 4.4.0)
 future       * 1.33.2  2024-03-26 [1] CRAN (R 4.4.0)
 future.apply   1.11.2  2024-03-28 [1] CRAN (R 4.4.0)
 generics       0.1.3   2022-07-05 [1] CRAN (R 4.4.0)
 ggplot2      * 3.5.1   2024-04-23 [1] CRAN (R 4.4.0)
 glmnet       * 4.1-8   2023-08-22 [1] CRAN (R 4.4.0)
 globals        0.16.3  2024-03-08 [1] CRAN (R 4.4.0)
 glue         * 1.7.0   2024-01-09 [1] CRAN (R 4.4.0)
 gridExtra      2.3     2017-09-09 [1] CRAN (R 4.4.0)
 gtable         0.3.5   2024-04-22 [1] CRAN (R 4.4.0)
 Hmisc          5.1-3   2024-05-28 [1] CRAN (R 4.4.0)
 hms            1.1.3   2023-03-21 [1] CRAN (R 4.4.0)
 htmlTable      2.4.2   2023-10-29 [1] CRAN (R 4.4.0)
 htmltools      0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
 htmlwidgets    1.6.4   2023-12-06 [1] CRAN (R 4.4.0)
 iterators      1.0.14  2022-02-05 [1] CRAN (R 4.4.0)
 jomo           2.7-6   2023-04-15 [1] CRAN (R 4.4.0)
 jsonlite       1.8.8   2023-12-04 [1] CRAN (R 4.4.0)
 knitr          1.46    2024-04-06 [1] CRAN (R 4.4.0)
 labeling       0.4.3   2023-08-29 [1] CRAN (R 4.4.0)
 lattice        0.22-6  2024-03-20 [2] CRAN (R 4.4.2)
 lifecycle      1.0.4   2023-11-07 [1] CRAN (R 4.4.0)
 listenv        0.9.1   2024-01-29 [1] CRAN (R 4.4.0)
 lme4           1.1-36  2025-01-11 [1] CRAN (R 4.4.2)
 lubridate    * 1.9.3   2023-09-27 [1] CRAN (R 4.4.0)
 magrittr     * 2.0.3   2022-03-30 [1] CRAN (R 4.4.0)
 MASS           7.3-61  2024-06-13 [2] CRAN (R 4.4.2)
 Matrix       * 1.7-1   2024-10-18 [2] CRAN (R 4.4.2)
 MatrixModels   0.5-3   2023-11-06 [1] CRAN (R 4.4.0)
 mgcv           1.9-1   2023-12-21 [2] CRAN (R 4.4.2)
 mice         * 3.16.0  2023-06-05 [1] CRAN (R 4.4.0)
 minqa          1.2.7   2024-05-20 [1] CRAN (R 4.4.0)
 mitml          0.4-5   2023-03-08 [1] CRAN (R 4.4.0)
 multcomp       1.4-25  2023-06-20 [1] CRAN (R 4.4.0)
 munsell        0.5.1   2024-04-01 [1] CRAN (R 4.4.0)
 mvtnorm        1.2-4   2023-11-27 [1] CRAN (R 4.4.0)
 nlme           3.1-166 2024-08-14 [2] CRAN (R 4.4.2)
 nloptr         2.0.3   2022-05-26 [1] CRAN (R 4.4.0)
 nnet           7.3-19  2023-05-03 [2] CRAN (R 4.4.2)
 pan            1.9     2023-08-21 [1] CRAN (R 4.4.0)
 parallelly     1.37.1  2024-02-29 [1] CRAN (R 4.4.0)
 patchwork    * 1.2.0   2024-01-08 [1] CRAN (R 4.4.0)
 pillar         1.9.0   2023-03-22 [1] CRAN (R 4.4.0)
 pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.4.0)
 polspline      1.1.25  2024-05-10 [1] CRAN (R 4.4.0)
 purrr        * 1.0.2   2023-08-10 [1] CRAN (R 4.4.0)
 quantreg       5.97    2023-08-19 [1] CRAN (R 4.4.0)
 R6             2.5.1   2021-08-19 [1] CRAN (R 4.4.0)
 rbibutils      2.3     2024-10-04 [1] CRAN (R 4.4.1)
 Rcpp           1.0.12  2024-01-09 [1] CRAN (R 4.4.0)
 Rdpack         2.6.1   2024-08-06 [1] CRAN (R 4.4.1)
 readr        * 2.1.5   2024-01-10 [1] CRAN (R 4.4.0)
 reformulas     0.4.0   2024-11-03 [1] CRAN (R 4.4.2)
 rlang          1.1.3   2024-01-10 [1] CRAN (R 4.4.0)
 rmarkdown      2.27    2024-05-17 [1] CRAN (R 4.4.0)
 rms            6.8-1   2024-05-27 [1] CRAN (R 4.4.0)
 rpart          4.1.23  2023-12-05 [2] CRAN (R 4.4.2)
 rstudioapi     0.16.0  2024-03-24 [1] CRAN (R 4.4.0)
 sandwich       3.1-0   2023-12-11 [1] CRAN (R 4.4.0)
 scales         1.3.0   2023-11-28 [1] CRAN (R 4.4.0)
 sessioninfo    1.2.2   2021-12-06 [1] CRAN (R 4.4.0)
 shape          1.4.6.1 2024-02-23 [1] CRAN (R 4.4.0)
 SparseM        1.81    2021-02-18 [1] CRAN (R 4.4.0)
 stringi        1.8.4   2024-05-06 [1] CRAN (R 4.4.0)
 stringr      * 1.5.1   2023-11-14 [1] CRAN (R 4.4.0)
 survival       3.7-0   2024-06-05 [1] CRAN (R 4.4.1)
 TH.data        1.1-2   2023-04-17 [1] CRAN (R 4.4.0)
 tibble       * 3.2.1   2023-03-20 [1] CRAN (R 4.4.0)
 tidyr        * 1.3.1   2024-01-24 [1] CRAN (R 4.4.0)
 tidyselect     1.2.1   2024-03-11 [1] CRAN (R 4.4.0)
 tidyverse    * 2.0.0   2023-02-22 [1] CRAN (R 4.4.0)
 timechange     0.3.0   2024-01-18 [1] CRAN (R 4.4.0)
 tzdb           0.4.0   2023-05-12 [1] CRAN (R 4.4.0)
 utf8           1.2.4   2023-10-22 [1] CRAN (R 4.4.0)
 vctrs          0.6.5   2023-12-01 [1] CRAN (R 4.4.0)
 viridisLite    0.4.2   2023-05-02 [1] CRAN (R 4.4.0)
 withr          3.0.0   2024-01-16 [1] CRAN (R 4.4.0)
 xfun           0.44    2024-05-15 [1] CRAN (R 4.4.0)
 yaml           2.3.8   2023-12-11 [1] CRAN (R 4.4.0)
 zoo            1.8-12  2023-04-13 [1] CRAN (R 4.4.0)

 [1] C:/Users/MichaelK/AppData/Local/R/win-library/4.4
 [2] C:/Program Files/R/R-4.4.2/library

------------------------------------------------------------------------------